# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# BiocManager::install("scran")

suppressPackageStartupMessages({
    library(Seurat)
    library(ggplot2) # plotting
    library(patchwork) # combining figures
    library(dplyr)
    library(xlsx)
})

1 Import data

data.filt = readRDS("results/data.filt.RDS")
data.filt
## An object of class Seurat 
## 33515 features across 1022 samples within 1 assay 
## Active assay: RNA (33515 features, 2000 variable features)
##  3 layers present: counts, data, scale.data
##  2 dimensional reductions calculated: pca, umap

2 Select highly variable genes

# selection.method = "vst": i.e. seurat_v3.
suppressWarnings(suppressMessages(data.filt <- FindVariableFeatures(data.filt, selection.method = "vst", 
                                                  nfeatures = 2000, verbose = FALSE, assay = "RNA")))

top20 <- head(VariableFeatures(data.filt), 20)

LabelPoints(plot = VariableFeaturePlot(data.filt), points = top20, repel = TRUE)

3 Scaling and regress out unwanted variation

Scale the data, here, we can regress out some unwanted variation, but first we need to identify those unwanted variation

data.filt <- ScaleData(data.filt, assay = "RNA")
data.filt <- RunPCA(data.filt, npcs = 50, verbose = F)
## $Plot contribution of metadata to each PCs
print("Plot contribution of metadata to each PCs...")
dummy = SingleCellExperiment::SingleCellExperiment(
  list(pc_space=t(data.filt@reductions$pca@cell.embeddings[, 1:10])), 
  colData=data.filt@meta.data[, c("nCount_RNA", "nFeature_RNA", "percent_mito", 
                                  "percent_ribo", "S.Score", "G2M.Score", "Phase")])
explan_pcs = scater::getVarianceExplained(dummy, exprs_values="pc_space")

scater::plotExplanatoryPCs(explan_pcs/100)
## [1] "Plot contribution of metadata to each PCs..."

From the plot above, we can see that, nFeature_RNA, nCount_RNA, percent_ribo, Phase, S.Score, percent.mit, G2M.Score explain more than 1% of the varaince, we need to regress them out duing scaling

  • nFeature_RNA and nCount_RNA are highly positively correlated, we just need to regress out one of them
  • We already know from last tutorial Phase is not accurate, so we will regres out S.Score and G2M.Score instead. Though sometimes, one may want to keep the cell cycle info in the data.
  • percent_ribo often contain useful biological information, we can consider removing ribosomal protein genes, but not regress them out
  • For sure, we can regress out percent.mit, because it is often correlated with cell quality

Scaling and regress out unwanted variation at the same time

data.filt <- ScaleData(data.filt, assay = "RNA", vars.to.regress = c("nFeature_RNA", "S.Score", "G2M.Score", "percent.mit"))

4 Dimensional Reduction: PCA

data.filt <- RunPCA(data.filt, npcs = 50, verbose = F)

# run ?RunPCA to check for more parameters
# By default, RunPCA will use the highly variable features for building PCA
wrap_plots(
    DimPlot(data.filt, reduction = "pca", group.by = "orig.ident", dims = 1:2),
    DimPlot(data.filt, reduction = "pca", group.by = "orig.ident", dims = 3:4),
    DimPlot(data.filt, reduction = "pca", group.by = "orig.ident", dims = 5:6),
    ncol = 3
) + plot_layout(guides = "collect")

# Visualize the gene loadings on each PCs.
VizDimLoadings(data.filt, dims = 1:5, reduction = "pca", ncol = 5, balanced = T)

Mainly the top PCs contain useful information, the rest of the PCs (principal components) may mostly contain random noises. In Seurat, the tSNE and UMAP analyses below are based on the PCA result, we will only select the top significant PCs and feed them to tSNE and UMAP.

How do we select the top PCs?

A common method for determining the number of PCs to be retained is a graphical representation known as an elbow plot. An elbow plot is a simple line segment plot that shows the eigenvalues (i.e. variability explained) for each individual PC. It shows the eigenvalues on the y-axis and the number of PCs on the x-axis. It always displays a downward curve. Most elbow plots look broadly similar in shape, starting high on the left, falling rather quickly, and then flattening out at some point. This is because the first component usually explains much of the variability, the next few components explain a moderate amount, and the latter components only explain a small fraction of the overall variability. The elbow plot criterion looks for the “elbow” in the curve and selects all components just before the line flattens out.

ElbowPlot(data.filt, reduction = "pca", ndims = 50)

Here, 10 looks a good number to keep, but we usually keep a bit more than that to make sure we get most of the useful information, Let’s keep 15 instead.

5 Dimensional reduction: tSNE

data.filt <- RunTSNE(
    data.filt,
    reduction = "pca", dims = 1:15,
    perplexity = 30, # low perplexity: finer structure
    max_iter = 1000,
    theta = 0.5,
    eta = 200,
    num_threads = 0
)

# run ?RunTSNE to see how to adjust parameters.
DimPlot(data.filt, reduction = "tsne", group.by = "orig.ident",
        pt.size=0.1)
# run ?DimPlot to see how to adjust parameters. 

What does each dot represent in the sSNE plot?

6 Dimensional reduction: UMAP

data.filt <- RunUMAP(
    data.filt,
    reduction = "pca",
    dims = 1:15,
    n.components = 2,
    n.neighbors = 30,
    n.epochs = 200,
    min.dist = 0.3,
    learning.rate = 1,
    spread = 1
)

# Run ?RunUMAP to see how to adjust the parameters

# Larger values will result in more global structure being preserved at the loss of detailed local structure. In general this parameter should often be in the range 5 to 50.
# min_dist: The minimum distance between two points in the UMAP embedding.
# spread: A scaling factor for distance between embedded points.
DimPlot(data.filt, reduction = "umap", group.by = "orig.ident") + ggplot2::ggtitle(label = "UMAP_on_PCA")

How different it is between a tSNE plot and a UMAP plot?

Let’s plot all three dimensional reduciton plots together

wrap_plots(
    DimPlot(data.filt, reduction = "pca", group.by = "orig.ident"),
    DimPlot(data.filt, reduction = "tsne", group.by = "orig.ident"),
    DimPlot(data.filt, reduction = "umap", group.by = "orig.ident"),
    ncol = 3
) + plot_layout(guides = "collect")

Apart from that, UMAP can also be built on the selected highly variable genes and on Graph.

Markers Cell Type
CD3E T cells
CD3E CD4 CD4+ T cells
CD3E CD8A CD8+ T cells
GNLY, NKG7 NK cells
MS4A1 B cells
CD14, LYZ, CST3, MS4A7 CD14+ Monocytes
FCGR3A, LYZ, CST3, MS4A7 FCGR3A+ Monocytes
FCER1A, CST3 DCs

Plot the marker genes on the UMAP plot:

myfeatures <- c("CD3E", "CD4", "CD8A", "NKG7", "GNLY", 
  "MS4A1", "CD14", "LYZ", "MS4A7", "FCGR3A", "CST3", "FCER1A")
FeaturePlot(data.filt, reduction = "umap", dims = 1:2, 
            features = myfeatures, ncol = 4, order = T) +
            NoLegend() + NoAxes() + NoGrid()

Based on the marker gene distribution, can you guess approximately (using manual annotation) what cell types the cells belong to.

7 Automatic cell type annotation

For demonstration, here, we will only run one cell&reference-based automatic annotation, called label transfer (From Seurat). But, instead of CCA, which is the default for the FindTransferAnchors() function, we will use pcaproject, ie; the query dataset is projected onto the RPCA of the reference dataset. Then, the labels of the reference data will be transfered to sufficiently similar ones in the query dataset.

  • CCA-based integration therefore enables integrative analysis when experimental conditions or disease states introduce very strong expression shifts, or when integrating datasets across modalities and species. However, CCA-based integration may also lead to overcorrection, especially when a large proportion of cells are non-overlapping across datasets.

  • RPCA-based integration runs significantly faster, and also represents a more conservative approach where cells in different biological states are less likely to ‘align’ after integration. We therefore recommend RPCA during integrative analysis where:

  • A substantial fraction of cells in one dataset have no matching type in the other

  • Datasets originate from the same platform (i.e. multiple lanes of 10x genomics)

  • There are a large number of datasets or cells to integrate (see here for more tips on integrating large datasets)

First, download the reference data

# devtools::install_github("immunogenomics/harmony")
# devtools::install_github("powellgenomicslab/scPred")
reference <- scPred::pbmc_1
reference
## An object of class Seurat 
## 32838 features across 3500 samples within 1 assay 
## Active assay: RNA (32838 features, 0 variable features)
##  2 layers present: counts, data

Here, we will run all the steps that we did in previous labs in one go with the pipe-operator %>%.

reference <- reference %>%
    NormalizeData() %>%
    FindVariableFeatures() %>%
    ScaleData() %>%
    RunPCA(verbose = F) %>%
    RunUMAP(dims = 1:15)
transfer.anchors <- FindTransferAnchors(
    reference = reference, query = data.filt,
    dims = 1:15
)
predictions <- TransferData(
    anchorset = transfer.anchors, refdata = reference$cell_type,
    dims = 1:15
)
data.filt <- AddMetaData(object = data.filt, metadata = predictions)
wrap_plots(
    DimPlot(reference, reduction = "umap", group.by = "cell_type"),
    DimPlot(data.filt, reduction = "umap", group.by = "predicted.id"),
    ncol = 2
) + plot_layout(guides = "collect")

For the two figures above, the one to the left is the reference dataset, while the one to the right is the predicted lables in the query dataset. Do you think they more or less agree with your preliminary manual annotation?

  • cMono: classical monocyte, i.e. CD14+ monocyte
  • ncMono: non-classical monocyte.

For other automatic annotation methods, please see my lectures.

8 Clustering

# First we need to build SNN Graph
data.filt <- FindNeighbors(data.filt,
    reduction = "pca",
    assay = "RNA",
    k.param = 20,
    features = VariableFeatures(data.filt)
)

# SNN was built based on pca as well as the highly variable features.
# run ?FindNeighbors to see how to adjust the parameters

# Clustering with louvain (algorithm 1) and a few different resolutions
for (res in c(0.1, 0.25, .5, 1, 1.5, 2, 2.5, 3, 3.5)) {
    data.filt <- FindClusters(data.filt, graph.name = "RNA_snn", resolution = res, algorithm = 1)
}

# ?FindClusters 
# each time you run clustering, the data is stored in meta data columns:
# seurat_clusters - lastest results only
# snn_res.XX - for each different resolution you test.
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1022
## Number of edges: 29820
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9511
## Number of communities: 6
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1022
## Number of edges: 29820
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9102
## Number of communities: 7
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1022
## Number of edges: 29820
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8604
## Number of communities: 9
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1022
## Number of edges: 29820
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7970
## Number of communities: 12
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1022
## Number of edges: 29820
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7453
## Number of communities: 13
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1022
## Number of edges: 29820
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6996
## Number of communities: 14
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1022
## Number of edges: 29820
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6594
## Number of communities: 14
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1022
## Number of edges: 29820
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6219
## Number of communities: 16
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1022
## Number of edges: 29820
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.5879
## Number of communities: 17
## Elapsed time: 0 seconds

Plot the clustering results

wrap_plots(
    DimPlot(data.filt, reduction = "umap", group.by = "RNA_snn_res.0.1") + ggtitle("louvain_0.1"),
    DimPlot(data.filt, reduction = "umap", group.by = "RNA_snn_res.0.25") + ggtitle("louvain_0.25"),
    DimPlot(data.filt, reduction = "umap", group.by = "RNA_snn_res.0.5") + ggtitle("louvain_0.5"),
    DimPlot(data.filt, reduction = "umap", group.by = "RNA_snn_res.1") + ggtitle("louvain_1"),
    DimPlot(data.filt, reduction = "umap", group.by = "RNA_snn_res.1.5") + ggtitle("louvain_1.5"),
    DimPlot(data.filt, reduction = "umap", group.by = "RNA_snn_res.2") + ggtitle("louvain_2"),
    DimPlot(data.filt, reduction = "umap", group.by = "RNA_snn_res.2.5") + ggtitle("louvain_2.5"),
    DimPlot(data.filt, reduction = "umap", group.by = "RNA_snn_res.3") + ggtitle("louvain_3"),
    DimPlot(data.filt, reduction = "umap", group.by = "RNA_snn_res.3.5") + ggtitle("louvain_3.5"),
    ncol = 3
)

Which clustering resolution give a result that mostly agrees with out annotation? It’s hard to see! It looks that resolution 0.1 holds the highest similarity. Let’s check the quality of the cells first:

feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo", "percent_hb", "percent_plat")
FeaturePlot(data.filt, features=feats, ncol=3)

Nothing obviously wrong!

What should we do? which one to choose! I recommend that at this stage, talk to your PI, and ask which version make the most biological sense.

During this process, try to get the differential genes for two or three sets of clusters you feel hard to choose from, and compare the differential genes and see with the help of more marker genes from the differential tables, whether you can decide on one that makes the most sense.

9 Differential gene analysis

For demonstration, here, we will identify the differential genes of the label transfer-predicted cell types

From the differential gene results, we can check whether the marker genes for a particular celltype will appear in the differentially expressed gene list for the corresponding cluster.

One can also use differential analysis for exploring new marker genes.

Wilcoxon rank-sum is a non-parametrical differential analysis method. You might also consider much more powerful differential testing packages like MAST, LR, limma, DESeq2.

we will run FindAllMarkers to test one cluster vs the rest, the largest celltype (T cell) will dominate the “rest” and influence the results the most. So it is often a good idea to subsample the clusters to an equal number of cells before running differential expression for one vs the rest. So lets select 30 cells per cell type here:

# Check cell number per celltype:
table(data.filt$predicted.id)
data.filt$predicted.id = factor(data.filt$predicted.id, levels=names(table(data.filt$predicted.id)))
# Identify differential genes for each celltype
Idents(data.filt) = data.filt$predicted.id
markers_genes <- FindAllMarkers(
    data.filt,
    log2FC.threshold = 0.2,
    test.use = "wilcox",
    min.pct = 0.1,
    min.diff.pct = 0.2,
    only.pos = TRUE,
    max.cells.per.ident = 30, # downsample each cell type to 30
    assay = "RNA",
    min.cells.group = 1
)

# Run ?FindAllMarkers to adjust for parameters!

# Filter the marker genes
markers_genes = markers_genes %>% filter(p_val_adj<0.1)
# Check the acquired number of marker genes per cell type:
table(markers_genes$cluster)
## 
##      B cell  CD4 T cell  CD8 T cell         cDC       cMono      ncMono 
##         202          76         523           4          93          30 
##     NK cell         pDC Plasma cell 
##          92           1           1 
## 
##      B cell  CD4 T cell  CD8 T cell         cDC       cMono      ncMono 
##          28           6          27         142         107         221 
##     NK cell         pDC Plasma cell 
##          12         289         168

Plot the top marker genes:

markers_genes %>%
    group_by(cluster) %>%
    slice_min(p_val_adj, n = 5, with_ties = FALSE) -> top5_sub

DotPlot(data.filt, features = rev(as.character(unique(top5_sub$gene))), 
        group.by = "predicted.id", assay = "RNA") + #coord_flip() +
        theme(axis.text.x = 
               element_text(angle = 30, vjust = 1, hjust=1))

Save the differential gene table into an excel file and now you can make some UMAP plots and take them to your PI and discuss.

write.xlsx(as.data.frame(markers_genes), 
            "results/DEGs_predicted.id.xlsx",
             col.names=TRUE, row.names=FALSE)

10 Save data

saveRDS(data.filt, "results/data.analyzed.RDS")

11 Recap

  • Select highly variable genes
  • Scaling and regress out unwanted variation
  • Dimensional reduction: PCA
  • Dimensional reduction: tSNE
  • Dimensional reduction: UMAP
  • Clustering
  • Differential gene analysis

12 Session info

sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
## 
## Matrix products: default
## BLAS/LAPACK: /Users/ekol-yal/miniconda3/envs/seurat5/lib/libopenblasp-r0.3.21.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] xlsx_0.6.5         dplyr_1.1.0        patchwork_1.1.2    ggplot2_3.4.1     
##  [5] Seurat_5.0.1       SeuratObject_5.0.1 sp_1.6-0           captioner_2.2.3   
##  [9] bookdown_0.33      knitr_1.42        
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.3                  spatstat.explore_3.1-0     
##   [3] reticulate_1.28             tidyselect_1.2.0           
##   [5] htmlwidgets_1.6.1           grid_4.2.2                 
##   [7] BiocParallel_1.32.5         Rtsne_0.16                 
##   [9] pROC_1.18.0                 munsell_0.5.0              
##  [11] ScaledMatrix_1.6.0          codetools_0.2-19           
##  [13] ica_1.0-3                   future_1.31.0              
##  [15] miniUI_0.1.1.1              withr_2.5.0                
##  [17] spatstat.random_3.1-4       colorspace_2.1-0           
##  [19] progressr_0.13.0            Biobase_2.58.0             
##  [21] highr_0.10                  rstudioapi_0.14            
##  [23] stats4_4.2.2                SingleCellExperiment_1.20.1
##  [25] ROCR_1.0-11                 tensor_1.5                 
##  [27] rJava_1.0-6                 listenv_0.9.0              
##  [29] MatrixGenerics_1.10.0       labeling_0.4.2             
##  [31] harmony_1.2.0               GenomeInfoDbData_1.2.9     
##  [33] polyclip_1.10-4             farver_2.1.1               
##  [35] parallelly_1.34.0           vctrs_0.5.2                
##  [37] generics_0.1.3              ipred_0.9-13               
##  [39] xfun_0.37                   timechange_0.2.0           
##  [41] R6_2.5.1                    GenomeInfoDb_1.34.9        
##  [43] ggbeeswarm_0.7.2            rsvd_1.0.5                 
##  [45] bitops_1.0-7                spatstat.utils_3.0-2       
##  [47] cachem_1.0.7                DelayedArray_0.24.0        
##  [49] promises_1.2.0.1            scales_1.2.1               
##  [51] nnet_7.3-18                 beeswarm_0.4.0             
##  [53] gtable_0.3.1                beachmat_2.14.2            
##  [55] globals_0.16.2              goftest_1.2-3              
##  [57] spam_2.10-0                 timeDate_4022.108          
##  [59] rlang_1.0.6                 splines_4.2.2              
##  [61] lazyeval_0.2.2              ModelMetrics_1.2.2.2       
##  [63] spatstat.geom_3.1-0         yaml_2.3.7                 
##  [65] reshape2_1.4.4              abind_1.4-5                
##  [67] httpuv_1.6.9                caret_6.0-93               
##  [69] lava_1.7.2.1                tools_4.2.2                
##  [71] ellipsis_0.3.2              jquerylib_0.1.4            
##  [73] RColorBrewer_1.1-3          BiocGenerics_0.44.0        
##  [75] ggridges_0.5.4              Rcpp_1.0.10                
##  [77] plyr_1.8.8                  sparseMatrixStats_1.10.0   
##  [79] zlibbioc_1.44.0             purrr_1.0.1                
##  [81] RCurl_1.98-1.10             rpart_4.1.19               
##  [83] deldir_1.0-6                pbapply_1.7-0              
##  [85] viridis_0.6.2               cowplot_1.1.1              
##  [87] S4Vectors_0.36.2            zoo_1.8-11                 
##  [89] SummarizedExperiment_1.28.0 ggrepel_0.9.3              
##  [91] cluster_2.1.4               magrittr_2.0.3             
##  [93] data.table_1.14.8           RSpectra_0.16-1            
##  [95] scattermore_1.2             lmtest_0.9-40              
##  [97] RANN_2.6.1                  fitdistrplus_1.1-11        
##  [99] matrixStats_0.63.0          xlsxjars_0.6.1             
## [101] mime_0.12                   evaluate_0.20              
## [103] xtable_1.8-4                fastDummies_1.7.3          
## [105] IRanges_2.32.0              gridExtra_2.3              
## [107] compiler_4.2.2              scater_1.26.1              
## [109] tibble_3.1.8                KernSmooth_2.23-20         
## [111] htmltools_0.5.4             later_1.3.0                
## [113] tidyr_1.3.0                 lubridate_1.9.2            
## [115] MASS_7.3-58.2               Matrix_1.6-5               
## [117] cli_3.6.0                   gower_1.0.1                
## [119] parallel_4.2.2              dotCall64_1.1-1            
## [121] igraph_1.4.1                GenomicRanges_1.50.2       
## [123] pkgconfig_2.0.3             recipes_1.0.5              
## [125] plotly_4.10.1               scuttle_1.8.4              
## [127] spatstat.sparse_3.0-1       foreach_1.5.2              
## [129] hardhat_1.2.0               vipor_0.4.7                
## [131] bslib_0.4.2                 XVector_0.38.0             
## [133] prodlim_2019.11.13          stringr_1.5.0              
## [135] digest_0.6.31               sctransform_0.4.1          
## [137] RcppAnnoy_0.0.20            spatstat.data_3.0-1        
## [139] rmarkdown_2.20              leiden_0.4.3               
## [141] uwot_0.1.14                 DelayedMatrixStats_1.20.0  
## [143] shiny_1.7.4                 lifecycle_1.0.3            
## [145] nlme_3.1-162                jsonlite_1.8.4             
## [147] BiocNeighbors_1.16.0        limma_3.54.2               
## [149] viridisLite_0.4.1           fansi_1.0.4                
## [151] pillar_1.8.1                lattice_0.20-45            
## [153] fastmap_1.1.1               httr_1.4.5                 
## [155] survival_3.5-3              glue_1.6.2                 
## [157] png_0.1-8                   iterators_1.0.14           
## [159] presto_1.0.0                class_7.3-21               
## [161] stringi_1.7.12              sass_0.4.5                 
## [163] RcppHNSW_0.4.1              scPred_1.9.2               
## [165] BiocSingular_1.14.0         irlba_2.3.5.1              
## [167] future.apply_1.10.0